# -*- coding: utf-8 -*-

#【案例3-1】导入本地Excel数据表格
import pandas as pd
df = pd.read_excel(r"D:\DataAnalysis\Chapter03Data\Book.xlsx")
print(df)

#给出完整文件路径
import pandas as pd
df = pd.read_excel(r"D:\DataAnalysis\Chapter03Data\Book.xlsx")
print(df)

import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter03Data/Book.xlsx")
print(df)

#指定工作表名称
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter03Data/Book.xlsx",
                   sheet_name = "BookData")
print(df)

#指定行索引和列索引
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter03Data/Book.xlsx",
                   sheet_name = "BookData",index_col = 0,header = 0)
print(df)

#指定读取的列
import pandas as pd
df = pd.read_excel("D:/DataAnalysis/Chapter03Data/Book.xlsx",
                sheet_name = "BookData", usecols = [0,2])
print(df)


#【案例3-2】导入CSV格式表格数据实例
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/Book.csv")
print(df)

#导入以分号分割的CSV文件
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/BookSep.csv", sep = ";")
print(df)

#读取Book.csv文件中的前三行
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/Book.csv", nrows = 3)
print(df)

#import pandas as pd
#df = pd.read_csv("D:/DataAnalysis/Chapter03Data/BookGBK.csv", encoding = "gbk")
#print(df)
#
#import pandas as pd
#df = pd.read_csv("D:/数据分析/Chapter03Data/Book.csv", engine = "python", 
#                 encoding = "utf-8-sig")
#print(df)


#【案例3-3】导入红酒质量数据集
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/winequality-red.csv",
                 sep = ";", usecols = [0,1,2], nrows = 3)
print(df)


#【案例3-4】导入文本数据
import pandas as pd
df = pd.read_table("D:/DataAnalysis/Chapter03Data/Book.txt",sep = "\t")
print(df)


#【案例3-5】导入SQLServer数据库数据
from sqlalchemy import create_engine
import pandas as pd
#dialect：mssql
#driver：pymssql
#username：sa
#password：123
#host：localhost
#database：DataAnalysisBook
conn = create_engine("mssql+pymssql://sa:123@localhost/DataAnalysisBook")
#SQL查询语句
sql = "select * from Book"
df = pd.read_sql(sql, conn)
print(df)
#断开数据库连接
conn.dispose()


#【案例3-6】导入MySQL数据库数据
from sqlalchemy import create_engine
import pandas as pd
#dialect：mysql
#driver：pymysql
#username：root
#password：123
#host：localhost
#database：DataAnalysisBook
conn = create_engine("mysql+pymysql://root:123@localhost/DataAnalysisBook")
#SQL查询语句
sql = "select * from Book"
df = pd.read_sql(sql, conn)
print(df)
#断开数据库连接
conn.dispose()


#【案例3-7】爬取A股公司净利润排行榜
import pandas as pd
#爬取的网络地址
url = "https://s.askci.com/stock/a/?bigIndustryId=ci0000001534#QueryCondition"
#A股公司净利润排行榜位于所有表格的第二个，下标为1
df = pd.read_html(url)[1]
print(df)


#【案例3-8】爬取豆瓣读书Top250的图书书名
from bs4 import BeautifulSoup
import requests
#设置header，让服务器若识别为浏览器发来的请求
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
#豆瓣读书Top250页面的网址
url = "https://book.douban.com/top250?start=1"
#向豆瓣读书页面发送请求
re = requests.get(url,headers=header)
#对获取的页面进行解析
soup = BeautifulSoup(re.text,'lxml')
#获取页面中的图书标题内容的HTML标记
titles = soup.select('div.pl2 > a')
booktitles = []
#提取HTML标记中的图书名称，并存入booktitles列表中
for title in titles:
    title = title.text.strip().replace("\n","").replace(" ","")
    booktitles.append(title)


#【案例3-9】爬取豆瓣读书Top250的书籍图片    
from bs4 import BeautifulSoup
import requests
import urllib.request
#设置header，让服务器若识别为浏览器发来的请求
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36',
}
#豆瓣读书Top250页面的网址
url = "https://book.douban.com/top250?start=1"
#向豆瓣读书页面发送请求
re = requests.get(url,headers=header)
#对获取的页面进行解析
soup = BeautifulSoup(re.text,'lxml')
#获取页面中的图书标题内容和图片的HTML标记
titles = soup.select('div.pl2 > a')
imgs = soup.select('a.nbg img')
#提取HTML标记中的书名，并将下载的图片更名为书名
for title,img in zip(titles,imgs):
    title = title.text.strip().replace("\n","").replace(" ","").replace(":","")
    urllib.request.urlretrieve(img.get('src'), "D:/imgs/" + title + ".jpg")


#【案例3-10】查看红酒质量数据集部分数据
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/winequality-red.csv", sep = ";")
print(df.head(4))

#查看数据结构
print(df.shape)


#【案例3-11】查看红酒质量数据集数据类型
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/winequality-red.csv", sep = ";")
print(df.info())


#【案例3-12】查看红酒质量数据集数据分布
import pandas as pd
df = pd.read_csv("D:/DataAnalysis/Chapter03Data/winequality-red.csv", sep = ";")
print(df.describe())


#【案例3-13】数据获取和熟悉数据综合实例
import pandas as pd
#爬取的网络地址
url = "https://s.askci.com/data/economy/00012/"
#A股公司净利润排行榜位于所有表格的第二个，下标为1
df = pd.read_html(url)[0]
print(df.head(4))
print(df.info())
print(df.describe())